# -*-coding:utf-8-*-
__author__ = 'Administrator'

##
# Plan:
# 1. Collect every link address on the first page.
# 2. Jump to the next page.
# 3. Keep collecting.
##

from bs4 import BeautifulSoup
import urllib
import urllib2
import re
import urlparse

new_urls = set()
old_urls = set()

root_url = 'http://www.5iweb.com.cn' #目标网站

# 获取第一页
request = urllib2.Request(root_url)
response = urllib2.urlopen(request)
content = response.read()

#获取页面就的地址
soup  = BeautifulSoup(content,'html.parser',from_encoding='utf-8')

links = soup.find_all("a",href=re.compile(r"^/.+"))
#print links
#旧URL
#新ULR
old_urls = set()
new_urls = set()

for link in links:
    new_url = link['href']
    if new_url not in old_urls:
        old_urls.add(new_url)
#print link['href']

test_url = '/html5-css3-effects_c/494.html'
if test_url not in old_urls:
    print old_urls

#links = soup.find_all("a",href=re.compile(r"\d+\.htm"))

